Open In Colab

Introduction¶

This article was originally created by Umair for a lab meeting presentation and later expanded by Kai to be more comprehensive. The main goal is to review linear regression.

We think this is important because linear regression is perhaps the most basic problem in machine learning, yet it actually has a closed-form solution. Many deep learning tutorials today jump directly to network structure, training, testing, loss functions and so on, but never really start from the simplest setting: ordinary least-squares regression.

Many problems that we encounter in the real world can actually be solved by linear regression. Yet many deep learning courses or tutorials start from less-than-ideal examples that could have been solved by linear regression, for example predicting house prices from the square footage of the house, or predicting grades in a course from the number of hours spent studying.

So we created this article to help beginners better understand the relative merits of conventional regression methods versus modern machine-learning methods. After reading this article, a reader will understand the closed-form solutions (optionally via LU decomposition and QR decomposition) to linear regression with or without L2 regularization, as well as gradient descent solutions to linear regression without regularization and with L2, L1 and Elastic Net penalties.

Linear Regression¶

Suppose we have a training sample $S=\{(x_1, y_1),\ldots, (x_n, y_n)\}$, where each $x_i \in \mathbb{R}^k$ is a vector of $k$ features and $y_i \in \mathbb{R}$ is a real-valued label or prediction. We would like to learn from $S$ a regression model or function $f:\mathbb{R}^k\to\mathbb{R}$ that can accurately predict labels of new instances in $\mathbb{R}^k$.

A linear regression model $f$ takes the following form: $$f(x)=x^Tw$$ where $x \in \mathbb{R}^k$ is an instance, and $$w=\begin{bmatrix}w_1\\w_2\\ \vdots \\ w_k \end{bmatrix}\in \mathbb{R}^k$$

$w_i$ are the weights or parameters of this linear regression model. We usually denote the predicted label $f(x)$ as $\hat{y}$, so $$\hat{y}:=f(x)=x^Tw$$

There are many real-world examples of such data. $Y$ may be housing prices and $X$ the square footage of the houses. $Y$ may be the grades in a class and $X$ the hours spent studying for the class.

We can represent our data of feature instances as an $n\times k$ matrix $X$, whose rows are transpositions of $x_i$ instances, i.e. $$X=\begin{bmatrix}x_1^T\\x_2^T\\ \vdots \\ x_n^T \end{bmatrix} \in \mathbb{R}^{n \times k}$$

and our labels can be represented as an $n\times 1$ matrix $Y$

$$Y=\begin{bmatrix}y_1\\y_2\\ \vdots \\ y_n \end{bmatrix} \in \mathbb{R}^n$$

Under this notation, labels predicted by the linear regression model $f$ above can be written as

$$\hat{Y}:=f(X)=Xw=\begin{bmatrix}x_1^T\\x_2^T\\ \vdots \\ x_n^T \end{bmatrix} \begin{bmatrix}w_1\\w_2\\ \vdots \\ w_k\end{bmatrix}$$
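As a quick illustration of this matrix form, here is a tiny NumPy sketch with made-up numbers (not part of the datasets used later in this article):

In [ ]:
import numpy as np

# made-up data: n = 4 instances, k = 2 features
X = np.array([[1.0, 2.0],
              [2.0, 0.5],
              [3.0, 1.5],
              [4.0, 3.0]])
w = np.array([0.7, -0.2])

Y_hat = X.dot(w)   # each entry is x_i^T w
Y_hat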

Bias term¶

It is possible in some situations that the relationship between the instances $x$ and labels $y$ is not well represented by a linear transformation, but can instead be represented by an affine transformation (a linear transformation followed by a translation). Here a translation means a map of the form $T(\overrightarrow{y})=\overrightarrow{y}+\overrightarrow{c}$ where $\overrightarrow{c} \in \mathbb{R}^k$. The amount of translation added to the linear transformation is called the bias. This gives a more general form of the linear regression model, which can be represented as $$f(x)=x^Tw+b$$ where $x \in \mathbb{R}^k$ is an instance, $$w=\begin{bmatrix}w_1\\w_2\\ \vdots \\ w_k \end{bmatrix}\in \mathbb{R}^k$$ is the weight vector as before, and $b \in \mathbb{R}$ is the bias term. With this generalized form, the labels predicted by the linear regression model $f$ can be written as

$$\hat{Y}:=f(X)=Xw+b=\begin{bmatrix}x_1^T\\x_2^T\\ \vdots \\ x_n^T \end{bmatrix} \begin{bmatrix}w_1\\w_2\\ \vdots \\ w_k\end{bmatrix}+\begin{bmatrix}b\\b\\ \vdots \\ b\end{bmatrix}$$

We are abusing notation a bit here by using the symbol $b$ again for the column vector of $b$'s.

Moreover, we can treat the bias term (or equivalently the affine transformation) as a linear transformation by appending a 1 to each instance $x_i$ as the $(k+1)$-th feature, and appending to the weight vector $w$ a new parameter $w_{k+1}:=b$ representing the bias term $b$, i.e.

$$\tilde{X}=\begin{bmatrix}x_1^T & 1\\x_2^T & 1\\ \vdots & \vdots \\ x_n^T & 1\end{bmatrix}\qquad \tilde{w}=\begin{bmatrix}w_1\\w_2\\ \vdots \\ w_k \\ w_{k+1} \end{bmatrix}$$

Now, our predictions can be written as

$$\hat{Y}=Xw+b=\begin{bmatrix}x_1^T\\x_2^T\\ \vdots \\ x_n^T \end{bmatrix} \begin{bmatrix}w_1\\w_2\\ \vdots \\ w_k\end{bmatrix}+\begin{bmatrix}b\\b\\ \vdots \\ b\end{bmatrix}= \begin{bmatrix}x_1^Tw+b\\x_2^Tw+b\\ \vdots \\ x_n^Tw+b \end{bmatrix}=\begin{bmatrix}x_1^T & 1\\x_2^T & 1\\ \vdots & \vdots \\ x_n^T & 1\end{bmatrix} \begin{bmatrix}w_1\\w_2\\ \vdots \\ w_k \\ w_{k+1} \end{bmatrix}=\tilde{X}\tilde{w}$$

Thus, we can treat the bias term as just another feature, and we are back to the case where $X$ is an $n\times k$ matrix and $w$ is a $k\times 1$ vector (with $k$ now counting the appended constant feature). Therefore, in the remainder of this article we will ignore the bias term and assume it is being taken care of in the manner above. A minimal sketch of this trick follows.
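Here is a small sketch with made-up numbers (the same idea is applied with np.stack in Example 1 below):

In [ ]:
import numpy as np

X = np.array([[1.0, 2.0],
              [2.0, 0.5],
              [3.0, 1.5]])   # n = 3 instances, k = 2 features
w = np.array([0.7, -0.2])
b = 1.5                      # bias term

X_tilde = np.column_stack([X, np.ones(len(X))])   # append 1 as the (k+1)-th feature
w_tilde = np.append(w, b)                         # append b as w_{k+1}

print(X.dot(w) + b)          # affine form
print(X_tilde.dot(w_tilde))  # purely linear form -- identical result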

Squared Loss¶

For our function to be a good predictor, we would like $\hat{Y}$ to be as close as possible to $Y$. We will measure closeness by the squared error, i.e. we would like $(\hat{y_i}-y_i)^2$ to be as small as possible. For the whole dataset, we define the loss w.r.t. the parameter vector $w$ as

$$Loss_w =\dfrac{1}{n}\sum_{i=1}^{n}(\hat{y_i}-y_i)^2=\dfrac{1}{n}\sum_{i=1}^{n}(w^Tx_i-y_i)^2$$

Now our objective is to find $$\hat{w}=\arg\min_{w\in \mathbb{R}^k} Loss_w$$
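In NumPy this loss is a one-liner; here is a minimal sketch, assuming NumPy is imported as np (we do this in the Coding preparation section below). The same expression reappears in the worked examples.

In [ ]:
def squared_loss(w, X, Y):
    """Mean squared error of the linear model Xw against the labels Y."""
    residual = X.dot(w) - Y              # \hat{y}_i - y_i for every instance
    return np.mean(np.square(residual))  # (1/n) * sum of squared residuals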

Optimal Parameter¶

Setting gradient of $Loss_w$ w.r.t $w$ equal to 0, we obtain $$\nabla_w Loss_w=\dfrac{2}{n}\sum_{i=1}^{n}(w^Tx_i-y_i)x_i=0$$ Rewriting this equation in terms of matrices, we get $$ X^TXw-X^Ty=0$$

$$X^TXw=X^Ty \qquad (\star)$$

If $X^TX$ is an invertible $k\times k$ matrix, we obtain a unique solution $$w=(X^TX)^{-1}X^Ty$$

If $X^TX$ is not invertible, we can instead use its pseudo-inverse $(X^TX)^+$ to obtain a solution $$w=(X^TX)^{+}X^Ty$$

This solution is not unique; in fact there are infinitely many solutions (the pseudo-inverse gives the minimum-norm one).
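As a small sketch of how these formulas translate into NumPy (with made-up data; the worked examples below go through this in detail), note that np.linalg.lstsq also solves the least-squares problem directly:

In [ ]:
import numpy as np

# made-up data: n = 5 instances, k = 2 features (second column is a constant feature)
X = np.array([[1.0, 1.0],
              [2.0, 1.0],
              [3.0, 1.0],
              [4.0, 1.0],
              [5.0, 1.0]])
y = np.array([2.1, 3.9, 6.2, 8.1, 9.8])

# textbook normal-equation solution (requires X^T X to be invertible)
w_inv = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)

# pseudo-inverse version, usable even when X^T X is singular
w_pinv = np.linalg.pinv(X.T.dot(X)).dot(X.T).dot(y)

# np.linalg.lstsq solves the least-squares problem directly
# (returning the minimum-norm solution when infinitely many exist)
w_lstsq, residuals, rank, svals = np.linalg.lstsq(X, y, rcond=None)

print(w_inv, w_pinv, w_lstsq)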

How do we judge whether a matrix is invertible? See the example below (adapted from an example found online):

In [ ]:
import numpy as np  
mat = np.array([[1,0,0],[-1,3,3],[1,2,2]]) 
rank = np.linalg.matrix_rank(mat)
cond = np.linalg.cond(mat)
print ("rank=", rank, " cond=", cond)
rank= 2  cond= 5.345097829566925e+17

Because the 2nd and 3rd columns of the matrix are identical, the rank is 2. The condition number is also extremely high. Therefore, the matrix is not invertible. You may want to run np.linalg.inv(mat) and see what you get; it could be an error about the matrix being singular, or it could produce numerically meaningless results, depending on your NumPy version within Colab.

In these cases, we can compute the Moore-Penrose pseudo-inverse of a matrix instead.

In [ ]:
import numpy as np  
mat = np.array([[1,0,0],[-1,3,3],[1,2,2]]) 
inv = np.linalg.pinv(mat)
print (inv)
np.dot(inv, mat) 
[[ 0.34210526 -0.26315789  0.39473684]
 [ 0.01315789  0.10526316  0.09210526]
 [ 0.01315789  0.10526316  0.09210526]]
Out[ ]:
array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.16333634e-17, 5.00000000e-01, 5.00000000e-01],
       [4.16333634e-17, 5.00000000e-01, 5.00000000e-01]])

Basically, this function calculates the generalized inverse of a matrix using its singular-value decomposition (SVD), keeping only the large singular values. The result of multiplying the pseudo-inverse with the original matrix is not an identity matrix: the entries in the second and third rows/columns are 0.5 instead. In any case, let's move on, keeping in mind that collinearity can be a challenge in linear regression; we will address it in the sections below through regularization.
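To make the SVD statement concrete, here is a rough sketch of how a pseudo-inverse can be assembled by hand from np.linalg.svd; np.linalg.pinv does essentially this, with its own tolerance handling.

In [ ]:
import numpy as np

mat = np.array([[1, 0, 0], [-1, 3, 3], [1, 2, 2]], dtype=float)

# SVD: mat = U @ diag(s) @ Vt
U, s, Vt = np.linalg.svd(mat)

# keep only the "large" singular values; zero out the near-zero ones
rcond = 1e-15                                        # cutoff relative to the largest singular value
s_inv = np.where(s > rcond * s.max(), 1.0 / s, 0.0)

pinv_manual = Vt.T.dot(np.diag(s_inv)).dot(U.T)
print(np.allclose(pinv_manual, np.linalg.pinv(mat)))  # expected to print True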

Coding preparation¶

Here we first import NumPy and Matplotlib, because these modules will be used in the examples below. Later (in Example 2) we will also define a solve_by_gradient_descent function. Note that we give that function a docstring by placing a string literal on its first line; because docstrings are often multiple lines, we use Python's triple-quote notation for multi-line strings.

In [ ]:
import numpy as np
import matplotlib.pyplot as plt

Example 1¶

We create a set of 25 $X$ values using np.arange plus some random noise, then calculate $Y$ as a function of $X$ (however, to make the problem more difficult, $Y$ is created not as a simple linear function but as a combination of several functions, with added noise and outliers). We then plot $Y$ versus $X$. There is a general trend that $Y$ increases with $X$, but the relationship is not monotonic because of the sin() term below. We will call this dataset 'Dataset 1'.

In [ ]:
rng=np.random.RandomState(3)
np.random.seed(543)

x=np.arange(5,30)/3+0.2*np.random.randn(25)+5
y=2*x-0.5*np.square(x-10)+3*np.sin(x+0.5)+1+0.5*np.random.randn(25)
y[::5]+=50*(0.5-rng.rand(5))
plt.scatter(x, y, s=30, c='red', marker='+')
plt.show()
In [ ]:
x, y
Out[ ]:
(array([ 6.56347837,  6.90468638,  7.36155461,  7.61217254,  7.83663237,
         8.50572885,  8.82459789,  8.99898214,  9.23107005,  9.46395953,
         9.9823754 , 10.42786634, 10.16821181, 11.04209398, 11.53256652,
        11.40559324, 12.32589701, 12.54873551, 13.09798103, 13.46266847,
        13.56169645, 13.90542475, 14.31070954, 14.12825122, 14.69545437]),
 array([ 9.17073128, 12.36943984, 14.41253873, 15.32495012, 16.90494901,
         8.19381999, 17.87332467, 18.00769794, 19.19394809, 18.49061105,
        29.17813602, 17.98533015, 18.50208979, 20.35720439, 21.2260356 ,
        20.27359827, 23.94399823, 23.66264186, 24.12110209, 25.83297486,
         5.94216416, 23.96504774, 21.94770807, 23.52732822, 20.20772171]))

We next stack a column of 1's onto $x$ to include the bias term in our model, creating a new $X$ matrix with two columns.

In [ ]:
new_x=np.stack([x,np.ones(len(x))],axis=1)
new_x
Out[ ]:
array([[ 6.56347837,  1.        ],
       [ 6.90468638,  1.        ],
       [ 7.36155461,  1.        ],
       [ 7.61217254,  1.        ],
       [ 7.83663237,  1.        ],
       [ 8.50572885,  1.        ],
       [ 8.82459789,  1.        ],
       [ 8.99898214,  1.        ],
       [ 9.23107005,  1.        ],
       [ 9.46395953,  1.        ],
       [ 9.9823754 ,  1.        ],
       [10.42786634,  1.        ],
       [10.16821181,  1.        ],
       [11.04209398,  1.        ],
       [11.53256652,  1.        ],
       [11.40559324,  1.        ],
       [12.32589701,  1.        ],
       [12.54873551,  1.        ],
       [13.09798103,  1.        ],
       [13.46266847,  1.        ],
       [13.56169645,  1.        ],
       [13.90542475,  1.        ],
       [14.31070954,  1.        ],
       [14.12825122,  1.        ],
       [14.69545437,  1.        ]])

We next calculate $X_{new}^TX_{new}$, which should be a $2\times 2$ matrix.

In [ ]:
np.transpose(new_x).dot(new_x)
Out[ ]:
array([[3026.92222155,  267.89838837],
       [ 267.89838837,   25.        ]])

And then calculate its inverse.

In [ ]:
inv=np.linalg.inv(np.transpose(new_x).dot(new_x)).astype(np.float64)
inv
Out[ ]:
array([[ 0.00640449, -0.06863014],
       [-0.06863014,  0.7754362 ]])

Next, follow the closed form equation above to calculate the $w$ values, with dot product.

In [ ]:
w=inv.dot(np.transpose(new_x).dot(y))
w
Out[ ]:
array([1.15421245, 6.45613743])

Here is the mean squared error for the training set.

In [ ]:
y_est=new_x.dot(w)    
error=np.mean(np.square(y-y_est))
print('\nMean Squared Error for training set:%.8f' %error)
Mean Squared Error for training set:21.93024489

Next, plot the prediction line (blue line below) and compare with the observed values (red cross below).

In [ ]:
plt.scatter(x, y, s=30, c='red', marker='+')
plt.plot(x, new_x.dot(w), c='blue')
plt.show()

In summary, in the example above, we used the closed-form equation to find the ordinary least-squares solution for $w$ (the first entry of $w$ being the slope and the second the intercept) that relates the observed $Y$ values to $X$.

Alternative approach 1 to tackle the problem: LU decomposition¶

Let us look at the solution above again carefully. We derived closed form solution through $$w=(X^TX)^{-1}X^Ty$$

This is a solution written in every textbook. However, is it practical? Note that matrix inversion is computationally expensive. If we instead simply look at the equation:$$X^TXw=X^Ty \qquad$$

It is essentially a system of linear equations (an $ax = b$ problem) in which we want the value of $w$ (playing the role of $x$). We can solve it directly with NumPy's solve function, which internally uses LU decomposition. This can be up to an order of magnitude faster than the closed-form solution above, especially when handling large matrices.

See below for the actual code. As you will see, the $w$ values are identical to those from the closed-form solution above.

In [ ]:
a = np.transpose(new_x).dot(new_x)
b = np.transpose(new_x).dot(y)
w = np.linalg.solve(a,b)
w
Out[ ]:
array([1.15421245, 6.45613743])
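If you want to see the LU factorization explicitly rather than leaving it hidden inside np.linalg.solve, SciPy exposes it through scipy.linalg.lu_factor and lu_solve. Here is a minimal sketch, reusing the same new_x and y arrays from above; it should give the same $w$.

In [ ]:
from scipy.linalg import lu_factor, lu_solve

a = np.transpose(new_x).dot(new_x)
b = np.transpose(new_x).dot(y)

lu, piv = lu_factor(a)          # factor a = P L U once
w_lu = lu_solve((lu, piv), b)   # then solve via forward/backward substitution
w_lu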

Now, we will use the %%timeit magic command to measure the time for each method (use %lsmagic to see a list of all magic commands within Colab). Since this is a small matrix, the difference in speed is modest, but you can get an idea.

In [ ]:
%%timeit
w1 = np.linalg.inv(np.transpose(new_x).dot(new_x)).dot(np.transpose(new_x).dot(y))
The slowest run took 19.69 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 5: 24.3 µs per loop
In [ ]:
%%timeit
w2 =  np.linalg.solve(np.transpose(new_x).dot(new_x), np.transpose(new_x).dot(y))
The slowest run took 9.87 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 5: 14.6 µs per loop

Alternative approach 2 to tackle the problem: QR decomposition¶

Let us consider the generic problem of solving $$X^TXw=X^Ty \qquad$$

Suppose we perform a QR decomposition of $X$, that is, write $X = QR$, where $Q$ has orthonormal columns ($Q^TQ=I$) and $R$ is an upper-triangular matrix. Then we have $$X^T = (QR)^T = R^T Q^T$$ $$X^T X = R^T Q^T Q R = R^T R$$

So the problem is now reduced to $$R^T R w = R^T Q^T y$$ and, since $R^T$ is invertible when $X$ has full column rank, $$R w = Q^T y$$

and we only need to solve this triangular system (e.g. by back-substitution) to get the value of $w$.

In [ ]:
(q, r) = np.linalg.qr(new_x)
a = r
b = np.transpose(q).dot(y)
w = np.linalg.solve(a,b)
w
Out[ ]:
array([1.15421245, 6.45613743])

Now, why would we do this? The answer has to do with the condition number, which measures how much the output of a function can change for a small change in its input. With this approach we avoid explicitly forming $X^T X$, whose condition number is the square of that of $X$ and can therefore be very high; this improves numerical precision when handling large matrices.

See the code below. The $R$ matrix has a much lower condition number than the $X^T X$ matrix, so this approach gives greater numerical precision in real-world applications with very large or ill-conditioned matrices.

In [ ]:
cond1 = np.linalg.cond(np.transpose(new_x).dot(new_x))
cond2 = np.linalg.cond(r)
cond1, cond2
Out[ ]:
(2384.116567408285, 48.82741614511562)

Alternative approach 3 to tackle the problem: gradient descent¶

Now consider a more complex scenario: a data matrix with 10 million rows. Say, for example, we generated gene expression data on 20,000 genes from 10 million human cells, and we want to run some sort of regression analysis to determine which genes are most relevant to a specific property of the cells (assuming that property is known for each cell from some type of measurement).

Traditional linear algebra, even QR decomposition, becomes impractical in this case; a closed-form solution is not really computationally tractable on conventional computers. To solve this problem, we want to avoid expensive matrix factorizations and use gradient descent instead to obtain approximately correct solutions. Iterative methods such as gradient descent have advantages when we have a large amount of data or the data is very sparse, which is exactly the situation with 10 million human cells. In practice, we would probably use stochastic gradient descent (SGD), where each update uses a mini-batch of the training data rather than all the training samples. A good explanation is given here.
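To make the mini-batch idea concrete, below is a rough sketch of mini-batch SGD for the squared loss. This is only an illustration (the solve_by_sgd name and the batch_size parameter are ours and are not used elsewhere in this article); the examples that follow still use full-batch gradient descent.

In [ ]:
def solve_by_sgd(X, y, rate=0.001, batch_size=32, num_epochs=1000, seed=0):
    """Illustrative mini-batch SGD for ordinary least squares."""
    rng = np.random.RandomState(seed)
    w = np.ones(X.shape[1])
    n = len(X)
    for epoch in range(num_epochs):
        order = rng.permutation(n)                        # shuffle the rows once per epoch
        for start in range(0, n, batch_size):
            idx = order[start:start + batch_size]         # indices of one mini-batch
            Xb, yb = X[idx], y[idx]
            grad = 2 * (Xb.dot(w) - yb).dot(Xb) / len(Xb) # gradient estimated on the mini-batch
            w = w - rate * grad
    return w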

Solution using gradient descent¶

Following the discussion above, rather than using closed-form solutions from traditional linear algebra, we use gradient descent to minimize the loss function, running 100,000 steps.

Recall our loss function $$Loss_w =\dfrac{1}{n}\sum_{i=1}^{n}(\hat{y_i}-y_i)^2=\dfrac{1}{n}\sum_{i=1}^{n}(w^Tx_i-y_i)^2$$

And we want to find $$\hat{w}=\arg\min_{w\in \mathbb{R}^k} Loss_w$$

$Loss_w$ is a real-valued function of $w$ (here a two-dimensional vector containing the slope and the intercept). If we calculate the gradient of $Loss_w$, we obtain

$$\nabla_w Loss_w=\dfrac{2}{n}\sum_{i=1}^{n}(w^Tx_i-y_i)x_i$$

From calculus we know that if $\nabla_w Loss_w\neq 0$, the function $Loss_w$ is increasing fastest in the direction of $\nabla_w Loss_w$ and decreasing fastest in the direction of $-\nabla_w Loss_w$.

We will move against the gradient to decrease the loss by taking a small step in the direction $$-\eta\nabla_w Loss_w$$ where $\eta$ is the step size (learning rate). Since the loss function is convex, we are guaranteed that $w$ will converge to the unique minimum if the step size is sufficiently small. We will update the parameter vector as $$w_{new}=w-\eta\nabla_w Loss_w$$

In [ ]:
w=np.array([1,1])   # initial guess for [slope, intercept]
loss_list=[]        # loss at each iteration
w_list=[]           # weight vector at each iteration
w_grad_list=[]      # gradient at each iteration
rate=0.001          # learning rate (step size)
for i in range(100000):
    w_list.append(w)
    y_est=new_x.dot(w)                          # current predictions Xw
    loss=np.mean(np.square(y-y_est))            # mean squared error
    loss_list.append(loss)
    w_grad=2*(y_est-y).dot(new_x)/len(new_x)    # gradient of the loss w.r.t. w
    w=w-rate*w_grad                             # gradient descent update
    w_grad_list.append(w_grad)
    

We can check the first 5 and last 5 values of the loss function $Loss_w$.

In [ ]:
loss_list[:5]+loss_list[-5:]
Out[ ]:
[72.61193759936884,
 51.53705277576042,
 39.49350369915836,
 32.61098612822289,
 28.677782706722965,
 21.93024489577661,
 21.93024489577622,
 21.93024489577582,
 21.930244895775427,
 21.930244895775033]

A plot of the training loss $Loss_w$ during the first 50 iterations.

In [ ]:
plt.scatter(range(1,len(loss_list[:50])+1),loss_list[:50], s=30, c='red', marker='+')
plt.show()

Here are the first 5 and last 5 gradient vectors $\nabla_w Loss_w$. The extremely small gradient values toward the end show that the solution has converged and there is not much to be gained by training further.

In [ ]:
w_grad_list[:5], w_grad_list[-5:]
Out[ ]:
([array([-154.27836215,  -14.21733628]),
  array([-116.61456982,  -10.88242764]),
  array([-88.14268041,  -8.36139436]),
  array([-66.61939705,  -6.45560901]),
  array([-50.3489026 ,  -5.01491947])],
 [array([ 1.75399006e-06, -1.98095519e-05]),
  array([ 1.75381050e-06, -1.98075241e-05]),
  array([ 1.75363099e-06, -1.98054965e-05]),
  array([ 1.75345147e-06, -1.98034691e-05]),
  array([ 1.75327198e-06, -1.98014419e-05])])

Here are the first 5 and last 5 weight vectors $w$.

In [ ]:
w_list[:5], w_list[-5:]
Out[ ]:
([array([1, 1]),
  array([1.15427836, 1.01421734]),
  array([1.27089293, 1.02509976]),
  array([1.35903561, 1.03346116]),
  array([1.42565501, 1.03991677])],
 [array([1.15422959, 6.45594391]),
  array([1.15422959, 6.45594393]),
  array([1.15422959, 6.45594395]),
  array([1.15422958, 6.45594397]),
  array([1.15422958, 6.45594399])])

This is the final value of the weight vector $w$. Check it against the closed-form solution above.

In [ ]:
w
Out[ ]:
array([1.15422958, 6.45594401])

We now plot the learned model (blue line) against the training dataset (red '+' symbols).

In [ ]:
plt.scatter(x, y, s=30, c='red', marker='+')
plt.plot(x, new_x.dot(w), c='blue')
plt.show()

Example 2¶

Here we will consider 4-dimensional data, i.e. our examples have four features $x,y,s,t$, and we will ignore the bias term for now. We will call this 'Dataset 2' and will come back to it several times. Note that the feature $s$ is essentially feature $x$ with some Gaussian noise added. The label $z$ is a linear combination of all the features plus some extra noise, but the coefficient of $t$ is almost zero (1e-6).

In [ ]:
rng=np.random.RandomState(3)
np.random.seed(543)

x=np.arange(5,25,0.1)
y=5+10*np.random.randn(200)
s=x+0.1*np.random.randn(200)
t=-5-10*np.random.randn(200)

z=2*x+y+3*s+1e-6*t+2*np.random.randn(200)
z[::20]+=10*(0.5-rng.rand(len(z[::20])))

X=np.stack([x,y,s,t],axis=1)

from sklearn.model_selection import train_test_split

train_input, test_input, train_label, test_label = train_test_split(X,z, test_size=0.5)

First we will solve this using the closed form, without any L2 regularization.

In [ ]:
inv=np.linalg.inv(np.transpose(train_input).dot(train_input))

w=inv.dot(np.transpose(train_input).dot(train_label))

print('Weight vector is')
print(w)

pred=train_input.dot(w)    
error=np.mean(np.square(train_label-pred))
print('\nMean Squared Error for training set:%.8f' %error)

pred=test_input.dot(w)    
error=np.mean(np.square(test_label-pred))
print('\nMean Squared Error for testing set:%.8f' %error)
Weight vector is
[ 6.34999142  1.0003759  -1.35219666  0.01582614]

Mean Squared Error for training set:4.06102838

Mean Squared Error for testing set:4.61530081

Now we will use gradient descent to solve this.

In [ ]:
def solve_by_gradient_descent(X,y,rate=0.001,l2=0,l1=0,early_drop=True,num_epochs=10000):
  """Solve an Xw -> y regression by gradient descent and return the weight vector w.

  rate is the learning rate; l2 and l1 are the regularization penalties.
  (The early_drop argument is accepted but not used in this simple implementation.)
  """
  print('Solving by gradient descent using \nL2-regularization penalty=%.4f \nL1-regularization penalty=%.4f\n' %(l2,l1))
  w=np.ones(X.shape[1])   # initialize all weights to 1

  for i in range(num_epochs):
      y_est=X.dot(w)                                              # current predictions
      # gradient of the squared loss plus the L2 and L1 penalty terms
      w_grad=2*(y_est-y).dot(X)/len(X) +2*l2*w + l1*np.sign(w)
      w=w-rate*w_grad                                             # gradient descent update

  # regularized loss at the end of training
  loss=np.mean(np.square(y-y_est)) + l2*np.sum(np.square(w)) + l1*np.sum(np.abs(w))
  print('Last loss calculated')
  print(loss)

  print('\nLast gradient vector')
  print(w_grad)

  print('\nFinal value of weight vector w:')
  print(w)

  y_est=X.dot(w)
  error=np.mean(np.square(y-y_est))
  print('\nMean Squared Error for training set:%.8f' %error)

  return w

Now we use the solve_by_gradient_descent function to solve it. Notice that the last gradient is very small, so we can be confident that we are close to convergence.

In [ ]:
w=solve_by_gradient_descent(train_input,train_label,early_drop=False,rate=0.0008,num_epochs=800000)
pred=test_input.dot(w)    
error=np.mean(np.square(test_label-pred))
print('\nMean Squared Error for testing set:%.8f' %error)
Solving by gradient descent using 
L2-regularization penalty=0.0000 
L1-regularization penalty=0.0000

Last loss calculated
4.061028604908347

Last gradient vector
[-4.85360505e-05  7.80602743e-08  4.85037157e-05  5.11682177e-08]

Final value of weight vector w:
[ 6.34537362  1.00038333 -1.34758194  0.01583101]

Mean Squared Error for training set:4.06102860

Mean Squared Error for testing set:4.61494387

L2-regularization¶

In this section, we introduce L2 regularization, also called Ridge regression. Essentially, we add a penalty term proportional to $\sum_j w_j^{2}$ to the loss function, which pushes the model to shrink the magnitudes of the weights $w_j$. Suppose our training set has $n$ examples. This problem also has a closed-form solution, as shown below.

For a $\lambda>0$, consider the following modified loss $$Loss_w =\dfrac{1}{n}\sum_{i=1}^{n}(\hat{y_i}-y_i)^2 +\lambda\sum_{j=1}^k w_j^{2}=\dfrac{1}{n}\sum_{i=1}^{n}(w^Tx_i-y_i)^2 +\lambda\sum_{j=1}^k w_j^{2}$$

And we want to find $$\hat{w}=\arg\min_{w\in \mathbb{R}^k} Loss_w$$

$Loss_w$ is a real valued function dependent on $w$.

Setting gradient of $Loss_w$ w.r.t $w$ equal to 0, we obtain $$\nabla_w Loss_w=\dfrac{2}{n}\sum_{i=1}^{n}(w^Tx_i-y_i)x_i+2\lambda w=0$$ Rewriting this equation in terms of matrices, we get

$$(X^TX+n\lambda I_k)w=X^Ty \qquad (\star)$$

Since $(X^TX+n\lambda I_k)$ is always an invertible $k\times k$ matrix for $\lambda>0$, we obtain a unique solution $$w=(X^TX+n\lambda I_k)^{-1}X^Ty$$

Closed form solution for Dataset 2¶

In [ ]:
l=0.5
inv=np.linalg.inv(np.transpose(train_input).dot(train_input)+len(train_input)*l*np.eye(train_input.shape[1]))

w=inv.dot(np.transpose(train_input).dot(train_label))
print(w)

pred=train_input.dot(w)    
error=np.mean(np.square(train_label-pred))
print('\nMean Squared Error for training set:%.8f' %error)

pred=test_input.dot(w)    
error=np.mean(np.square(test_label-pred))

print('\nMean Squared Error for testing set:%.8f' %error)
[2.53511683 1.004805   2.45503268 0.01672721]

Mean Squared Error for training set:4.22093582

Mean Squared Error for testing set:4.46394639

Gradient descent with L2-regularization for Dataset 2¶

This works just like the non-regularized case, except that the gradient now is $$\nabla_w Loss_w=\dfrac{2}{n}\sum_{i=1}^{n}(w^Tx_i-y_i)x_i+2\lambda w$$

In [ ]:
w=solve_by_gradient_descent(train_input,train_label,early_drop=False,rate=0.0008,num_epochs=400000,l2=0.5)
pred=test_input.dot(w)    
error=np.mean(np.square(test_label-pred))
print('\nMean Squared Error for testing set:%.8f' %error)
Solving by gradient descent using 
L2-regularization penalty=0.5000 
L1-regularization penalty=0.0000

Last loss calculated
10.952893661577539

Last gradient vector
[-2.43360887e-13  8.28226376e-14  2.64233080e-13 -1.52308721e-15]

Final value of weight vector w:
[2.53511683 1.004805   2.45503268 0.01672721]

Mean Squared Error for training set:4.22093582

Mean Squared Error for testing set:4.46394639

Why should we use L2 regularization?¶

We see above that L2 regularization gives us lower testing error than unregularized regression, despite having higher training error. As you can observe, the weights of features $x$ and $s$ add up to roughly the same amount, ~5, in both the L2-regularized and unregularized regressions, because $s$ is essentially a copy of $x$ and together they contribute $2x+3s\approx 5x$ to $z$. However, with Ridge regression these two weights are smaller and similar to each other due to the L2 penalty. Ridge regression performs better because it does not overfit the training dataset as much, since its weights are smaller.

Ridge regression is useful for preventing overfitting, but it is not very good at feature selection. Note that $t$ has virtually no contribution (1e-6) to $z$, yet both Ridge and plain linear regression assign a much larger weight (~1.6e-2) to the feature $t$. Ridge regression scales weights down but never makes them exactly zero (or almost zero). Moreover, it doesn't remove redundant features. We know that features $x$ and $s$ are essentially the same, so one of them is redundant. Ridge regression will not get rid of this redundancy; in fact it prefers to keep it in this case. This is because if we distribute the weight equally between $x$ and $s$, i.e. give them 2.5 each, we get an L2 penalty of $\lambda(2.5^2+2.5^2)=12.5\lambda$, which is lower than giving only one feature the entire weight of 5, in which case the L2 penalty is $25\lambda$.

This is where L1-regularization comes in.

L1-regularization (LASSO Regression)¶

For a $\delta>0$, consider the following modified loss $$Loss_w =\dfrac{1}{n}\sum_{i=1}^{n}(\hat{y_i}-y_i)^2+\delta \sum_{j=1}^k |w_j|=\dfrac{1}{n}\sum_{i=1}^{n}(w^Tx_i-y_i)^2 +\delta \sum_{j=1}^k |w_j|$$

And we want to find $$\hat{w}=\arg\min_{w\in \mathbb{R}^k} Loss_w$$

$Loss_w$ is a real valued function dependent on $w$.

This problem does not have a closed-form solution, so we will use gradient descent. Strictly speaking, the absolute value is not differentiable at zero, so below we use the subgradient $\operatorname{sign}(w)$ (with $\operatorname{sign}(0)=0$):

$$\nabla_w Loss_w=\dfrac{2}{n}\sum_{i=1}^{n}(w^Tx_i-y_i)x_i+\delta \cdot \operatorname{sign}(w)$$

Gradient Descent with L1-regularization for Dataset 2¶

In [ ]:
w=solve_by_gradient_descent(train_input,train_label,early_drop=False,rate=0.0008,num_epochs=400000,l1=2)
pred=test_input.dot(w)    
error=np.mean(np.square(test_label-pred))
print('\nMean Squared Error for testing set:%.8f' %error)
Solving by gradient descent using 
L2-regularization penalty=0.0000 
L1-regularization penalty=2.0000

Last loss calculated
16.090378483244212

Last gradient vector
[-1.62531677e-02  2.60706725e-05  1.62424026e-02  1.72649139e-05]

Final value of weight vector w:
[4.99103690e+00 9.96157412e-01 5.54013947e-04 5.57531970e-03]

Mean Squared Error for training set:4.10373083

Mean Squared Error for testing set:4.47741986

Why use L1 regularization?¶

Unlike L2 regularization, L1 regularization shrinks weights to zero and produces a sparse weight vector. In particular, if there are redundant or highly correlated features, it will lean toward including only one of them in the model and push the weights of the other correlated features close to zero, provided the regularization coefficient is large enough. We can see in the example above that $s$ has an almost zero weight, and the entire combined weight for $x$ and $s$ is attributed to $x$. Moreover, the weight for $t$ is 5.5e-3, which is much smaller than in the Ridge or plain linear regression case.

However, we may not always want to remove redundant or correlated features. In fact, it is sometimes good to know which features are correlated in a model. Looking only at the LASSO result, we might conclude that $s$ is not an important variable or has no predictive ability, which is not the case. In Ridge regression, by contrast, we get to keep both $x$ and $s$. Also, when the number of features $p$ is much larger than the number of samples $n$ ($p \gg n$), LASSO will select at most $n$ features.

We can use elastic nets, presented next, to strike a good balance.

Elastic Net¶

Elastic net is a regularized regression method that linearly combines the L1 and L2 penalties of the lasso and ridge regressions.

A simple way to implement an elastic net is to add both L1 and L2 penalties, each with its own regularization coefficient. In this case, the loss becomes: $$Loss_w =\dfrac{1}{n}\sum_{i=1}^{n}(\hat{y_i}-y_i)^2+\lambda \sum_{j=1}^k w_j^2 +\delta \sum_{j=1}^k |w_j| =\dfrac{1}{n}\sum_{i=1}^{n}(w^Tx_i-y_i)^2 +\lambda \sum_{j=1}^k w_j^2 +\delta \sum_{j=1}^k |w_j|$$

where $\lambda \geq 0$ is the L2 regularizer and $\delta \geq 0$ is the L1 regularizer. We recover LASSO and Ridge regression as special cases of the elastic net by setting $\lambda=0$ or $\delta=0$, respectively.

This loss function is strongly convex due to the presence of the quadratic L2 penalty (for $\lambda>0$). We will solve it using gradient descent.

Gradient Descent with Elastic Net for Dataset 2¶

In [ ]:
w=solve_by_gradient_descent(train_input,train_label,early_drop=False,rate=0.0001,num_epochs=800000,l1=2,l2=0.5)
pred=test_input.dot(w)
error=np.mean(np.square(test_label-pred))
print('\nMean Squared Error for testing set:%.8f' %error)
Solving by gradient descent using 
L2-regularization penalty=0.5000 
L1-regularization penalty=2.0000

Last loss calculated
22.952967690877948

Last gradient vector
[-2.20623519e-12  5.04929432e-13  2.05524486e-12 -1.33226763e-15]

Final value of weight vector w:
[2.5344403  0.99842602 2.45042448 0.00510151]

Mean Squared Error for training set:4.26375892

Mean Squared Error for testing set:4.42632330

In this example, the Elastic Net outperforms plain linear, Ridge and LASSO regression on the test set. It keeps both the $x$ and $s$ features, and at the same time reduces the weight for $t$ to about 5e-3, just as LASSO does. Also, note how the last gradient vector is much smaller here than with LASSO alone: adding the L2 penalty stabilizes the L1 regularization and makes the optimization converge faster due to the strong convexity of the combined penalty. Strong convexity also encourages grouping of highly correlated features, as discussed in the Ridge regression section.
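As a rough cross-check, scikit-learn's Ridge, Lasso and ElasticNet estimators implement the same ideas; a minimal sketch follows. Note that scikit-learn's penalty conventions (its alpha and l1_ratio parameters, and how it scales the squared-error term) differ from the $\lambda$ and $\delta$ used in this article, so the coefficients will not match ours exactly; the alpha values below are arbitrary illustrative choices.

In [ ]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet

# fit_intercept=False because Dataset 2 was generated without a bias term
models = [('Ridge', Ridge(alpha=1.0, fit_intercept=False)),
          ('Lasso', Lasso(alpha=0.1, fit_intercept=False)),
          ('ElasticNet', ElasticNet(alpha=0.1, l1_ratio=0.5, fit_intercept=False))]

for name, model in models:
    model.fit(train_input, train_label)
    test_mse = np.mean(np.square(test_label - model.predict(test_input)))
    print(name, model.coef_, 'test MSE=%.4f' % test_mse)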

See this link for more discussion on this topic.